Libraries to load before exploring data
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidyr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readxl)
Loading Dataset and cleaning names to snake_case
super_store_data <- readxl::read_excel("Sample - EU Superstore.xls")
super_store_data |>
janitor::clean_names() -> super_store_data
Summary_statistics
summary(super_store_data$sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.955 49.462 119.355 293.809 320.709 7958.580
summary(super_store_data$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3059.82 1.32 14.22 37.28 48.51 3979.08
Relation between Sales and Profit
plotly::ggplotly(
ggplot(super_store_data, aes(x= sales, y= profit))+
geom_hex()+
geom_smooth()+
labs(x="Sales", y="Profit", title = "Relation Between Sales and Profit")
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Realtion between Profit and Discount
plotly::ggplotly(
ggplot(super_store_data, aes(x=profit, y= discount))+
geom_hex()+
geom_smooth()+
labs(x="Profit", y="Dicount", title = "Relation Between Profit and Discount")
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Customers count across Segments
plotly::ggplotly(
ggplot(super_store_data, aes(x= segment))+
geom_bar(fill= "steelblue")+
labs(x= "Segments", title = "Customers count across Segments",
y= "Frequency")
)
Shipment-Mode Preference
plotly::ggplotly(
ggplot(super_store_data, aes(x= ship_mode))+
geom_bar(fill= "palegreen", col= "black")+
labs(x= "Shipment mode", title = "Customers Shipment Mode preference",
y= "Frequency")
)
Distribution of customers across country
plotly::ggplotly(
ggplot(super_store_data |>
group_by(country) |> count(),
aes(x= country, y=n, fill= country))+
geom_bar(stat = "identity")+
labs(x="Country", fill="Labels", y="Number of Customers",
title = "Distribution of Customers across Country")+
theme(axis.text.x = element_text(angle = 90))
)
Hot selling Sub-Categories in Categories
options(scipen = 999)
plotly::ggplotly(
ggplot(super_store_data |>
group_by(category, sub_category) |>
summarise(sales=sum(sales)),
aes(x=category, y= sales, fill= sub_category))+
geom_bar(stat = "identity", col= "black")+
labs(x= "Category", y= "Sales Distribution", fill="Sub-Category",
title = "Hot selling Sub-Categories in Categories")
)
## `summarise()` has grouped output by 'category'. You can override using the `.groups` argument.
Popular Sub-Categories in Categories
busin_orient_cat_subcat <- super_store_data |>
group_by(category, sub_category) |>
count()
options(scipen = 999)
plotly::ggplotly(
ggplot(busin_orient_cat_subcat, aes(x= category, y= n, fill= sub_category))+
geom_bar(stat = "identity", col= "lightyellow")+
labs(x= "Category", y="Frequency", fill="Sub-Category",
title = "Popular Sub-Categories in Categrory")
)
Sales and Profit by Sub-Category
sales_profit_by_sub_category <- super_store_data |>
group_by(sub_category) |>
summarise(sales=sum(sales),
profit= sum(profit)) |>
pivot_longer(sales:profit,
names_to = "key",
values_to = "value")
options(scipen = 999)
plotly::ggplotly(
ggplot(sales_profit_by_sub_category, aes(x= sub_category, y= value,
fill= key))+
geom_bar(stat = "identity", position = "dodge")+
theme(axis.text.x = element_text(angle = 90)) +
labs(x= "Sub-Category", y= "Total Sales and Profits",
title = "Sales and Profit by Sub-Category")
)
Gross-Profit by Sub-Category
plotly::ggplotly(
ggplot(super_store_data |>
group_by(sub_category) |>
summarise(gross_profit= sum(profit)),
aes(x= sub_category, y= gross_profit))+
geom_col(fill= "lightyellow", col= "black")+
theme(axis.text.x = element_text(angle = 90)) +
labs(x= "Sub-Category", y= "Gross-Profit",
title = "Gross-Profit by Sub-Category")
)
Gross-Profit by Category
plotly::ggplotly(
ggplot(super_store_data |>
group_by(category) |>
summarise(gross_profit= sum(profit)),
aes(x= category, y= gross_profit))+
geom_col(fill= "lightyellow", col= "black")+
theme(axis.text.x = element_text(angle = 90)) +
labs(x= "Category", y= "Gross-Profit",
title = "Gross-Profit by Category")
)
Profit by category and subcategory
options(scipen = 999)
plotly::ggplotly(
ggplot(super_store_data |>
group_by(category,sub_category) |>
summarise(profit= sum(profit)),
aes(x= category, y= profit, fill= sub_category))+
geom_col(col="black") +
labs(x= "Category", y= "Profit", fill= "Sub-Category",
title = "Profit by Category and Sub-Category")
)
## `summarise()` has grouped output by 'category'. You can override using the `.groups` argument.
Sales Trend Over Years
super_store_data |>
mutate(year= lubridate::year(order_date)) |>
group_by(year) |>
summarise(revenue= sum(sales))-> yearly_sales
options(scipen = 999)
plotly::ggplotly(
ggplot(yearly_sales, aes(x=year, y= revenue))+
geom_point(col="yellow", size = 2)+
geom_line(col= "blue4") +
labs(x= "Years", y= "Revenue", title = "Sales Trend Over Years")
)
Monthly Sales Trend over Years
super_store_data |>
mutate(month= lubridate::month(order_date, label = TRUE),
year= lubridate:: year(order_date)) |>
group_by(month, year) |>
summarise(sales= sum(sales)) -> monthly_sales_by_year
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(monthly_sales_by_year, aes(x= month, y= sales, color= as.factor(year),
group= as.factor(year)))+
geom_line()+
geom_point(col= "black", size= 1) +
labs(x= "Months", y= "Sales Value", color= "Years",
title = "Monthly Sales Trend over Years")
)
Monthly Purchase Frequency across years
super_store_data |>
mutate(month= lubridate::month(order_date, label = TRUE),
year= lubridate::year(order_date)) |>
group_by(month, year) |>
summarise(n_transactions= n_distinct(order_id)) ->monthly_p_freq_ov_years
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(monthly_p_freq_ov_years, aes(x= month, y= n_transactions,
color= as.factor(year),
group= as.factor(year)))+
geom_line()+
geom_point(col= "black", size=1)+
labs(x= "Months", y="Number of Transactions", color= "Years",
title = "Montly Purchase Frequency Across Years")
)
Monthly Sales Trend Across Years Region Wise
super_store_data |>
mutate(month= lubridate::month(order_date, label = TRUE),
year= lubridate::year(order_date)) |>
group_by(month,year,region) |>
summarise(revenue= sum(sales)) ->m_y_sales_trend_region
## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(m_y_sales_trend_region, aes(x= month, y= revenue,
color= as.factor(year),
group= as.factor(year)))+
geom_line()+
geom_point(col= "black", size=1)+
facet_wrap(~region, nrow = 3)+
theme(axis.text.x = element_text(angle = 90))+
labs(x= "Years", y="Sales", color= "Year",
title = "Montly Sales Trend Across Years Region Wise")
)
Average Life-Span of Customers with Business in Months
super_store_data |>
group_by(customer_id) |>
summarise(monthly_lifespan= difftime(max(order_date), min(order_date), units = "days")) |>
mutate(monthly_lifespan= round(as.integer(monthly_lifespan/30))) -> cus_eng_monthly
plotly::ggplotly(
ggplot(cus_eng_monthly |>
filter(monthly_lifespan > 0),
aes(x= monthly_lifespan))+
geom_bar(fill= "steelblue", col= "lightyellow")+
labs(x= "Monthly Life-Span", y= "Number of Customers",
title = "Average Life-Span of Customers with Business in Months")
)
Dominant Sub-Categories Over Years (Transactions)
super_store_data |>
mutate(year= lubridate::year(order_date)) |>
group_by(sub_category, year) |>
summarise(n_transactions= n_distinct(order_id)) -> dom_sbcat_o_years
## `summarise()` has grouped output by 'sub_category'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(dom_sbcat_o_years, aes(x= sub_category, y= n_transactions,
color= as.factor(year),
group= as.factor(year)))+
geom_line()+
labs(x= "Number of Transactions", y= "Sub-Category",
title = "Dominant Sub-Categories Over Years (Transactions)") +
theme(axis.text.x = element_text(angle = 45))
)
Dominant Category Over Years (Transactions)
super_store_data |>
mutate(year= lubridate::year(order_date)) |>
group_by(category, year) |>
summarise(n_transactions= n_distinct(order_id)) -> dom_cat_o_years
## `summarise()` has grouped output by 'category'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(dom_cat_o_years, aes(x= category, y= n_transactions,
color= as.factor(year), group= as.factor(year))) +
geom_line() +
geom_point(col= "black", size = 1)+
labs(x= "Category", y= "Number of Transactions", color= "Year",
title = "Dominant Sub-Categories Over Years (Transactions)")
)
Dominant Segment Over Years (Revenue)
super_store_data |>
mutate(year= lubridate::year(order_date)) |>
group_by(segment, year) |>
summarise(revenue= sum(sales)) -> dom_seg_o_years
## `summarise()` has grouped output by 'segment'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(dom_seg_o_years, aes(x= segment, y= revenue, color= as.factor(year),
group= as.factor(year))) +
geom_line()+
geom_point(col= "black", size= 1) +
labs(x= "Segment", y= "Revenue",
title = "Dominant Segment Over Years (Revenue)")
)
Dominant Country Over Years (Profit)
super_store_data |>
mutate(year= lubridate::year(order_date)) |>
group_by(country, year) |>
summarise(profit= sum(profit)) -> dom_country_o_years
## `summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(dom_country_o_years, aes(x= country, y= profit, col= as.factor(year),
group= as.factor(year)))+
geom_line()+
geom_point(col= "black", size= 1)+
labs(x= "Country", y= "Profit",
title = "Dominant Country Over Years (Profit)")+
theme(axis.text.x = element_text(angle = 90))
)
Average Basket size of Sub-Categories
super_store_data |>
mutate(n_transactions= n_distinct(order_id)) |>
group_by(sub_category) |>
summarise(quantity= sum(quantity),
n_transactions= n_transactions) |>
distinct() |>
mutate(basket_size= quantity/n_transactions) -> avg_basket_size_subcat
## `summarise()` has grouped output by 'sub_category'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(avg_basket_size_subcat, aes(x= basket_size, y= sub_category))+
geom_col(fill="darkseagreen", col="black")+
labs(x= "Average Basket Size", y= "Sub-Category",
title = "Average Basket size of Sub-Categories")
)
Average Basket size of Sub-Categories By Region
super_store_data |>
mutate(n_transactions= n_distinct(order_id)) |>
group_by(sub_category,region) |>
summarise(quantity= sum(quantity),
n_transactions= n_transactions) |>
mutate(basket_size= quantity/n_transactions) ->avg_basket_size_region
## `summarise()` has grouped output by 'sub_category', 'region'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(avg_basket_size_region, aes(x= basket_size, y= sub_category))+
geom_col(fill="darkseagreen") +
facet_wrap(~region) +
labs(x= "Average Basket Size", y= "Sub-Category",
title = "Average Basket size of Sub-Categories By Region")
)
Average Basket size by product
super_store_data |>
mutate(n_transactions= n_distinct(order_id)) |>
filter(sub_category == "Tables") |>
group_by(product_id,sub_category) |>
summarise(quantity= sum(quantity),
n_transactions= n_transactions) |>
distinct() |>
mutate(basket_size= quantity/n_transactions) ->avg_basket_size_product
## `summarise()` has grouped output by 'product_id', 'sub_category'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(avg_basket_size_product, aes(x= basket_size, y= product_id)) +
geom_col(fill= "steelblue", col= "black") +
facet_wrap(~ sub_category) +
labs(x= "Average Basket-Size", y= "Product-ID",
title = "Average Basket-Size by Product")
)
Purchase Frequency Across Week-Days in Months
super_store_data |>
mutate(month= lubridate::month(order_date, label = TRUE),
week_day=weekdays(order_date)) |>
group_by(month, week_day) |>
summarise(n_transactions= n_distinct(order_id)) ->n_trans_week_days
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
plotly::ggplotly(
ggplot(n_trans_week_days, aes(x= month, y= week_day, fill= n_transactions))+
geom_tile()+
labs(x= "Month", y= "Week-Days", fill = "Purchase Frequency",
title = "Purchase Frequency Across Week-Days in Months")
)
RFM-Analysis
date_today <- lubridate::ymd("2019/04/10")
super_store_data |>
group_by(customer_id) |>
summarise(total_revenue= sum(sales),
n_transactions= n_distinct(order_id),
last_purchase_date= max(order_date)) |>
mutate(days_since_last_purchase=
difftime(date_today, last_purchase_date, units = "days")) |>
filter(total_revenue > 0) |>
mutate(monetary_quintile = cut2(total_revenue, g =5),
frequency_quintile = cut2(n_transactions, g = 5),
recency_quintile = cut2(days_since_last_purchase, g = 5)) |>
mutate(recency_score = as.integer(recency_quintile),
frequency_score = as.integer(frequency_quintile),
monetary_score = as.integer(monetary_quintile)) |>
mutate(recency_score = dense_rank(desc(recency_score))) |>
mutate(RFM= recency_score * 100 + frequency_score * 10 + monetary_score) |>
mutate(segments= ifelse(recency_score >= 4 & frequency_score >= 3 & monetary_score >= 4, "Champions",
ifelse((recency_score >= 4) & (frequency_score <= 2) & (monetary_score >= 4), "High Spending New Customers",
ifelse((recency_score >= 4) & (frequency_score >= 4) & (monetary_score == 3), "Average Spending Champions",
ifelse((recency_score >= 2 & recency_score <= 4) & (frequency_score >= 3 & frequency_score <= 5) & (monetary_score >= 4), "Loyal Customers",
ifelse((recency_score >= 3) & (frequency_score >= 1 & frequency_score <= 3) & (monetary_score >= 1 & monetary_score <= 3), "Potential Loyalists",
ifelse((recency_score >= 4 & recency_score <= 5) & (frequency_score < 2) & (monetary_score < 2), "New Customers",
ifelse((recency_score >= 3 & recency_score <= 4) & (frequency_score < 2) & (monetary_score < 2), "Promising",
ifelse((recency_score >= 3 & recency_score <= 4) & (frequency_score >= 2 & frequency_score <= 4) & (monetary_score >= 3 & monetary_score <= 5), "Need attention",
ifelse((recency_score >= 2 & recency_score <= 3) & (frequency_score < 3) & (monetary_score < 3), "About to sleep",
ifelse((recency_score < 3) & (frequency_score >=2 & frequency_score <= 5) & (monetary_score >= 2 & monetary_score <= 5), "At risk",
ifelse((recency_score < 2) & (frequency_score >= 4 & frequency_score <= 5) & (monetary_score >= 4 & monetary_score <= 5), "Can't loose them",
ifelse((recency_score >= 2 & recency_score <=3) & (frequency_score >= 2 & frequency_score <= 3) & (monetary_score >= 2 & monetary_score <= 3), "Hibernating",
ifelse((recency_score <= 2) & (frequency_score <= 2) & (monetary_score >= 4), "High Value Lost",
ifelse((recency_score < 2) & (frequency_score <= 3) & (monetary_score <= 2), "Low Value Lost",
ifelse((recency_score == 3) & (frequency_score < 2) & (monetary_score >= 4), "High Spending New Customers",
ifelse((recency_score <= 2) & (frequency_score < 2) & (monetary_score == 3), "Average Spending Lost",
ifelse((recency_score <= 2) &(frequency_score <= 4) &(monetary_score == 1), "Low Value Hibernating",
ifelse((recency_score <= 3) &(frequency_score >= 4) &(monetary_score <=3), "Average Spending Need Attention", "Low Spending Champions"))))))))))))))))))) -> rfm
Number of Customers in Segments
plotly::ggplotly(
ggplot(rfm, aes(x= segments, fill= segments))+
geom_bar(col= "black")+
theme(axis.text.x = element_text(angle = 45))
)
Calculating CLTV
# purchase value
super_store_data |>
group_by(order_id) |>
summarise(purchase_value= sum(sales)) |>
ungroup() |>
summarise(avg_purch_value= mean(purchase_value)) -> avg_purchase_value
# purchase frequency
super_store_data |>
mutate(month= lubridate::month(order_date),
year= lubridate::year(order_date)) |>
group_by(customer_id, month,year) |>
summarise(purchase_frequency= n_distinct(order_id)) |>
ungroup() |>
summarise(avg_purch_frequency= mean(purchase_frequency)) ->avg_purchase_frequency
## `summarise()` has grouped output by 'customer_id', 'month'. You can override using the `.groups` argument.
#Gross margin
super_store_data |>
mutate(gross_margin= sum(profit)/ sum(sales))
## # A tibble: 10,000 x 21
## row_id order_id order_date ship_date ship_mode customer_id
## <dbl> <chr> <dttm> <dttm> <chr> <chr>
## 1 1 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 2 2 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 3 3 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 4 4 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 5 5 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 6 6 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 7 7 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 8 8 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840
## 9 9 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840
## 10 10 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840
## # ... with 9,990 more rows, and 15 more variables: customer_name <chr>,
## # segment <chr>, city <chr>, state <chr>, country <chr>, region <chr>,
## # product_id <chr>, category <chr>, sub_category <chr>, product_name <chr>,
## # sales <dbl>, quantity <dbl>, discount <dbl>, profit <dbl>,
## # gross_margin <dbl>
gross_margin <- 0.1268953
# life span
super_store_data |>
mutate(month= lubridate::month(order_date)) |>
group_by(customer_id) |>
summarise(lifespan= difftime(max(order_date), min(order_date), units = "days")) |>
mutate(lifespan= round(as.integer(lifespan/30))) |>
summarise(avg_life_span= mean(lifespan)) -> avg_life_span
CLTV= (avg_purchase_value$avg_purch_value) * (gross_margin) * (avg_purchase_frequency$avg_purch_frequency) * (avg_life_span$avg_life_span)
CLTV
## [1] 2845.339
JOINS Ship-Mode Preference by Segments (JOINS)
left_join(rfm, super_store_data) -> joined
## Joining, by = "customer_id"
plotly::ggplotly(
ggplot(joined |>
group_by(segments) |>
count(ship_mode), aes(x= segments, y= n,
fill= ship_mode))+
geom_col(position = "fill", col= "black")+
theme(axis.text.x = element_text(angle = 90))+
labs(x= "Customer Segments", y= "Count", fill= "Shipment-Mode",
title = "Ship-Mode Preference by Segments")+
coord_flip()
)
Segments Count Across Sub-Categories
plotly::ggplotly(
ggplot(joined |>
group_by(sub_category) |>
count(segments), aes(x= segments, y= n, fill= sub_category))+
geom_col(position = "fill", col="black") +
theme(axis.text.x = element_text(angle = 90)) +
labs(x= "Customer Segments", y= "Count", fill= "Sub-Category",
title = "Segments Count Across Sub-Categories")
)